!pip install git+https://github.com/kavgan/word_cloud.git
!pip install wordcloud
!pip install gensim
!pip install stop_words
import numpy as np
import io
import pandas as pd
from stop_words import get_stop_words
from nltk.tokenize import RegexpTokenizer
from gensim import corpora, models
import gensim
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from PIL import Image
import PIL.ImageOps
import random
from wordcloud import ImageColorGenerator
%matplotlib inline
%config InlineBackend.figure_format='retina'
# Load the patent dataset (description, claims, classification code) and
# collapse the two text columns into a single per-document field.
df = pd.read_csv('small_700_through_710_descr_clm_code.csv')
df = df.drop(columns='Unnamed: 0')           # stray index column from the CSV export
df['descr_clm'] = df['descr'] + df['clm']    # description + claims as one text blob
df = df.drop(columns=['descr', 'clm'])
df['code'] = df['code'].astype('category')
df.head()

# Documents labeled with code 705 only.
df_705 = df[df['code'] == 705]
df_705.head()
# Word cloud for the class-705 documents.
from wordcloud import STOPWORDS
# Patent-boilerplate terms that carry no topical signal.
custom_stopword_list = ['using','said','based','wherein','comprising','comprise','diagram','system','process','method','one', 'may','claim','embodiment','invention','include', 'example', 'include','step','figure','fig']
# BUG FIX: `stopwords = STOPWORDS` aliased the library's global set, so the
# in-place `|=` below permanently mutated wordcloud.STOPWORDS for every other
# user of it. Copy the set before extending it.
stopwords = set(STOPWORDS)
stopwords |= set(custom_stopword_list)

text = df_705.descr_clm.values
# BUG FIX: str(text) on a numpy array produces the *truncated* array repr
# ("['...' '...' ... '...']"), so only the first/last few documents ever
# reached the cloud. Join every document into one corpus string instead
# (map(str, ...) guards against non-string entries such as NaN).
wordcloud = WordCloud(
    width = 3000,
    height = 2000,
    background_color = 'black',
    stopwords = stopwords).generate(' '.join(map(str, text)))

fig = plt.figure(
    figsize = (40, 30),
    facecolor = 'k',
    edgecolor = 'k')
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()
# Word cloud for the class-706 documents, same settings as the 705 cloud.
df_706 = df[df['code']==706]
df_706.head()

text_706 = df_706.descr_clm.values
# BUG FIX: str(text_706) is the truncated numpy array repr, so most documents
# were dropped from the cloud. Join the documents into one corpus string
# (map(str, ...) guards against non-string entries such as NaN).
wordcloud = WordCloud(
    width = 3000,
    height = 2000,
    background_color = 'black',
    stopwords = stopwords).generate(' '.join(map(str, text_706)))

fig = plt.figure(
    figsize = (40, 30),
    facecolor = 'k',
    edgecolor = 'k')
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()
# TF-IDF-weighted HTML word cloud (kavgan/word_cloud package).
# NOTE: this import shadows wordcloud.WordCloud used in the cells above.
from word_cloud.word_cloud_generator import WordCloud
from IPython.core.display import HTML

wc = WordCloud(use_tfidf=True, stopwords=stopwords)
# BUG FIX: the original called `sort(text)` — `sort` is not a builtin, so this
# line raised NameError. sorted() returns the documents as a plain sorted list.
text = sorted(text)
# Show only the top 500 terms.
# NOTE(review): the original comment said "don't randomize color", but the
# call passes random_color=True — behavior kept as-is; confirm the intent.
embed_code = wc.get_embed_code(text=text, random_color=True, topn=500)
HTML(embed_code)
!pip install pyLDAvis
from __future__ import print_function
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
# Restrict the frame to the two classes being compared (705 vs 706),
# then inspect the label distribution.
df_705_706 = df[df['code'].isin([705, 706])]
df_705_706['code'].unique()
df_705_706['code'].value_counts()
# --- LDA topic modeling on the combined 705+706 corpus (2 topics) ---
tf_vectorizer_705_706 = CountVectorizer(strip_accents = 'unicode',
                                        stop_words = stopwords,
                                        lowercase = True,
                                        token_pattern = r'\b[a-zA-Z]{3,}\b',  # alphabetic tokens, 3+ chars
                                        max_df = 0.5,   # drop terms appearing in >50% of docs
                                        min_df = 10)    # drop terms appearing in <10 docs
dtm_tf_705_706 = tf_vectorizer_705_706.fit_transform(df_705_706.descr_clm.values)
print(dtm_tf_705_706.shape)

# BUG FIX: the original read params from an undefined name `tf_vectorizer`
# (NameError); reuse the settings of the count vectorizer defined just above.
tfidf_vectorizer_705_706 = TfidfVectorizer(**tf_vectorizer_705_706.get_params())
dtm_tfidf_705_706 = tfidf_vectorizer_705_706.fit_transform(df_705_706.descr_clm.values)
print(dtm_tfidf_705_706.shape)

# FIX: the `n_topics` kwarg was renamed to `n_components` (deprecated in
# scikit-learn 0.19, removed in 0.21).
# for TF DTM
lda_tf_705_706 = LatentDirichletAllocation(n_components=2, random_state=0)
lda_tf_705_706.fit(dtm_tf_705_706)
# for TFIDF DTM
lda_tfidf_705_706 = LatentDirichletAllocation(n_components=2, random_state=0)
lda_tfidf_705_706.fit(dtm_tfidf_705_706)

pyLDAvis.sklearn.prepare(lda_tf_705_706, dtm_tf_705_706, tf_vectorizer_705_706)
pyLDAvis.sklearn.prepare(lda_tfidf_705_706, dtm_tfidf_705_706, tfidf_vectorizer_705_706)
# --- LDA topic modeling on the 705-only corpus (5 topics) ---
tf_vectorizer_705 = CountVectorizer(strip_accents = 'unicode',
                                    stop_words = stopwords,
                                    lowercase = True,
                                    token_pattern = r'\b[a-zA-Z]{3,}\b',  # alphabetic tokens, 3+ chars
                                    max_df = 0.5,   # drop terms appearing in >50% of docs
                                    min_df = 10)    # drop terms appearing in <10 docs
dtm_tf_705 = tf_vectorizer_705.fit_transform(df_705.descr_clm.values)
print(dtm_tf_705.shape)

# BUG FIX: the original read params from an undefined name `tf_vectorizer`
# (NameError); reuse the settings of the count vectorizer defined just above.
tfidf_vectorizer_705 = TfidfVectorizer(**tf_vectorizer_705.get_params())
dtm_tfidf_705 = tfidf_vectorizer_705.fit_transform(df_705.descr_clm.values)
print(dtm_tfidf_705.shape)

# FIX: the `n_topics` kwarg was renamed to `n_components` (deprecated in
# scikit-learn 0.19, removed in 0.21).
# for TF DTM
lda_tf_705 = LatentDirichletAllocation(n_components=5, random_state=0)
# BUG FIX: the original called `lda_tf.fit(...)` on an undefined name;
# fit the model created on the line above.
lda_tf_705.fit(dtm_tf_705)
# for TFIDF DTM
lda_tfidf_705 = LatentDirichletAllocation(n_components=5, random_state=0)
lda_tfidf_705.fit(dtm_tfidf_705)

pyLDAvis.sklearn.prepare(lda_tf_705, dtm_tf_705, tf_vectorizer_705)
pyLDAvis.sklearn.prepare(lda_tfidf_705, dtm_tfidf_705, tfidf_vectorizer_705)
# --- LDA topic modeling on the 706-only corpus (5 topics) ---
tf_vectorizer_706 = CountVectorizer(strip_accents = 'unicode',
                                    stop_words = stopwords,
                                    lowercase = True,
                                    token_pattern = r'\b[a-zA-Z]{3,}\b',  # alphabetic tokens, 3+ chars
                                    max_df = 0.5,   # drop terms appearing in >50% of docs
                                    min_df = 10)    # drop terms appearing in <10 docs
dtm_tf_706 = tf_vectorizer_706.fit_transform(df_706.descr_clm.values)
print(dtm_tf_706.shape)

# BUG FIX: the original read params from an undefined name `tf_vectorizer`
# (NameError); reuse the settings of the count vectorizer defined just above.
tfidf_vectorizer_706 = TfidfVectorizer(**tf_vectorizer_706.get_params())
dtm_tfidf_706 = tfidf_vectorizer_706.fit_transform(df_706.descr_clm.values)
print(dtm_tfidf_706.shape)

# FIX: the `n_topics` kwarg was renamed to `n_components` (deprecated in
# scikit-learn 0.19, removed in 0.21).
# for TF DTM
lda_tf_706 = LatentDirichletAllocation(n_components=5, random_state=0)
lda_tf_706.fit(dtm_tf_706)
# for TFIDF DTM
lda_tfidf_706 = LatentDirichletAllocation(n_components=5, random_state=0)
lda_tfidf_706.fit(dtm_tfidf_706)

pyLDAvis.sklearn.prepare(lda_tf_706, dtm_tf_706, tf_vectorizer_706)
pyLDAvis.sklearn.prepare(lda_tfidf_706, dtm_tfidf_706, tfidf_vectorizer_706)